Still under construction.
(III) Detailed List
- Read and load every line of the source code for each of the 250 movies.
- Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
- The data was collected on 2020-10-30.
# Read the resource at `curr_movie_link` line by line.
# Returns a character vector with one element per line; the lines are read
# with encoding = "UTF-8".
h_get_movie_source_code <- function(curr_movie_link) {
  readLines(curr_movie_link, encoding = "UTF-8")
}
# Get the actual release date of the movie.
#
# movie_source_code: character vector, one element per line of the page.
#
# Returns, for every line containing "Release Date:", the second piece of
# that line after splitting on the literal "h4> " (NA when a matching line
# has no such piece). When no line contains "Release Date:" the result is ""
# -- the same "missing" convention the h_get_* getters use -- rather than the
# empty list that sapply() would produce for an empty input.
get_release_date_from_movie_source_code <- function(movie_source_code) {
  release_date_pattern <- "Release Date:"
  release_date_lines <- grep(release_date_pattern, movie_source_code,
                             fixed = TRUE, value = TRUE)
  if (length(release_date_lines) == 0L) {
    return("")
  }
  pieces <- strsplit(release_date_lines, "h4> ", fixed = TRUE)
  # vapply pins the result type to character regardless of the input size.
  vapply(pieces, function(parts) parts[2L], character(1))
}
# Build the poster markup for one movie page.
#
# movie_source_code: character vector, one element per line of the page.
# output_width:      value written into the trailing width="..." attribute.
#
# The poster section runs from the line matching <div class="poster"> to the
# first line after it that matches "</a> </div>". Lines 3 and 4 of that
# section are joined with a space, the text after the first "> " and before
# the first " /" is kept (presumably the opening of the <img> tag -- confirm
# against a real page), and ` width="<output_width>">` is appended.
get_poster_from_movie_source_code <- function(movie_source_code, output_width) {
  start_marker <- "<div class=\"poster\">"
  end_marker <- "</a> </div>"

  first_line <- grep(start_marker, movie_source_code)
  end_candidates <- grep(end_marker, movie_source_code)
  # First end marker located strictly after the start marker.
  last_line <- end_candidates[which(end_candidates > first_line)[1]]

  poster_section <- movie_source_code[first_line:last_line]
  joined <- paste(poster_section[3:4], collapse = " ")

  after_open <- sapply(str_split(joined, "> "), function(parts) parts[2])
  tag_head <- sapply(str_split(after_open, " /"), function(parts) parts[1])
  paste0(tag_head, " width=\"", output_width, "\">")
}
# Preview: fetch the posters of the first three movies in `m_link` and show
# them in a styled HTML table, then drop the temporaries.
p <- NULL
for (i in seq_len(3)) {
  curr_movie_sc <- h_get_movie_source_code(m_link[i])
  curr_poster <- get_poster_from_movie_source_code(curr_movie_sc, 75)
  p <- c(p, curr_poster)
}
p_df <- tibble(p)
# escape = FALSE so the poster markup is emitted as HTML rather than as text.
row_spec(
  kable_styling(
    kable(p_df, align = "c", escape = FALSE),
    bootstrap_options = c("striped",
                          "hover",
                          "responsive"),
    fixed_thead = TRUE,
    full_width = FALSE
  ),
  0:3,
  extra_css = "vertical-align: middle;"
)
rm(p, i, p_df)
# Get the basic-info JSON (the application/ld+json script) from the source
# code of a single movie page.
#
# movie_source_code: character vector, one element per line of the page.
#
# Returns the lines from the first one matching
# <script type="application/ld+json">{ through the first line AT OR AFTER it
# that matches }</script> (the whole script may sit on a single line).
# Returns character(0) when either marker cannot be found, so the h_get_*
# getters downstream fall back to their "" default instead of failing.
h_get_basics_from_movie_source_code <- function(movie_source_code) {
  json_start_pattern <- "<script type=\"application/ld\\+json\">\\{"
  json_end_pattern <- "\\}</script>"

  start_hits <- grep(json_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  json_start_line <- start_hits[1L]

  # Only closing tags at or after the start count: an earlier inline script
  # ending in "}</script>" must not be mistaken for the end of this block.
  end_hits <- grep(json_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits >= json_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[json_start_line:end_hits[1L]]
}
# TRUE when at least one element of `basics` matches the regular expression
# `curr_pattern`, FALSE otherwise.
h_existence_checking <- function(basics, curr_pattern) {
  matching_lines <- grep(curr_pattern, basics)
  length(matching_lines) > 0
}
# Extract the genre(s) from the basic-info JSON lines.
#
# basics: character vector holding the lines of the JSON block.
#
# Returns a single string. Only a line holding the JSON key "genre": is
# considered, so free text that merely contains the word "genre" (a review,
# a keyword list) is ignored. The value may be a scalar ("genre": "Drama") or
# an array, possibly spread over the following lines; array entries are
# joined with ", ". Returns "" when the key is absent or has no quoted value.
h_get_genre <- function(basics) {
  key_pattern <- "\"genre\"[[:space:]]*:"
  key_hits <- grep(key_pattern, basics)
  if (length(key_hits) == 0L) {
    return("")
  }
  key_line <- key_hits[1L]

  # Text following the key's colon on the key line.
  key_match <- regexpr(key_pattern, basics[key_line])
  value <- substring(basics[key_line],
                     key_match + attr(key_match, "match.length"))
  value <- trimws(value, which = "left")

  if (startsWith(value, "[")) {
    if (!grepl("]", value, fixed = TRUE)) {
      # The array continues on later lines: append them up to the closing "]".
      rest <- basics[seq_len(length(basics) - key_line) + key_line]
      closing <- grep("]", rest, fixed = TRUE)
      if (length(closing) > 0L) {
        value <- paste(c(value, rest[seq_len(closing[1L])]), collapse = " ")
      }
    }
    inside <- sub("\\].*$", "", value)
    quoted <- regmatches(inside, gregexpr("\"[^\"]*\"", inside))[[1L]]
  } else {
    # Scalar value: only the first quoted string belongs to this key.
    quoted <- regmatches(value, regexpr("\"[^\"]*\"", value))
  }

  paste(gsub("\"", "", quoted, fixed = TRUE), collapse = ", ")
}
# Extract the content rating from the basic-info JSON lines.
# Returns "" when no line matches "contentRating"; otherwise, for each
# matching line, the second piece after splitting on `: "`, cut at the
# first `"`.
h_get_content_rating <- function(basics) {
  key <- "contentRating"
  if (!h_existence_checking(basics, key)) {
    return("")
  }
  key_lines <- basics[grep(key, basics)]
  after_colon <- sapply(str_split(key_lines, ": \""), function(parts) parts[2])
  sapply(str_split(after_colon, "\""), function(parts) parts[1])
}
# Extract the number of ratings from the basic-info JSON lines.
# Returns "" when no line matches "ratingCount"; otherwise, for each matching
# line, the second piece after splitting on ": ", cut at the first ",".
# The value is returned as text, not as a number.
h_get_rating_count <- function(basics) {
  key <- "ratingCount"
  if (!h_existence_checking(basics, key)) {
    return("")
  }
  key_lines <- basics[grep(key, basics)]
  after_colon <- sapply(str_split(key_lines, ": "), function(parts) parts[2])
  sapply(str_split(after_colon, ","), function(parts) parts[1])
}
# Extract the rating value from the basic-info JSON lines.
# Returns "" when no line matches "ratingValue". Otherwise only the FIRST
# matching line is used: its second piece after splitting on `: "`, cut at
# the first `"`.
h_get_rating_value <- function(basics) {
  key <- "ratingValue"
  if (!h_existence_checking(basics, key)) {
    return("")
  }
  first_key_line <- basics[grep(key, basics)][1]
  after_colon <- sapply(str_split(first_key_line, ": \""),
                        function(parts) parts[2])
  sapply(str_split(after_colon, "\""), function(parts) parts[1])
}
# Extract the publication date from the basic-info JSON lines.
# Returns "" when no line matches "datePublished"; otherwise, for each
# matching line, the second piece after splitting on `: "`, cut at the
# first `"`.
h_get_date_published <- function(basics) {
  key <- "datePublished"
  if (!h_existence_checking(basics, key)) {
    return("")
  }
  key_lines <- basics[grep(key, basics)]
  after_colon <- sapply(str_split(key_lines, ": \""), function(parts) parts[2])
  sapply(str_split(after_colon, "\""), function(parts) parts[1])
}
# Collect the basic fields of one movie from its JSON lines.
# Returns the concatenation, in this order, of: genre, content rating,
# rating count, rating value and date published.
h_get_basics_info <- function(basics) {
  c(
    h_get_genre(basics),
    h_get_content_rating(basics),
    h_get_rating_count(basics),
    h_get_rating_value(basics),
    h_get_date_published(basics)
  )
}
# Get the box office section from the source code of a single movie page.
#
# movie_source_code: character vector, one element per line of the page.
#
# Returns the lines from the first <h3 class="subheading">Box Office</h3>
# heading through the first <hr /> line strictly after it. Returns
# character(0) when the page has no Box Office heading or no closing <hr />
# after it; previously both cases aborted with an error from `:`.
h_get_box_office_from_movie_source_code <- function(movie_source_code) {
  box_office_start_pattern <- "<h3 class=\"subheading\">Box Office</h3>"
  box_office_end_pattern <- "<hr />"

  start_hits <- grep(box_office_start_pattern, movie_source_code)
  if (length(start_hits) == 0L) {
    return(character(0))
  }
  box_office_start_line <- start_hits[1L]

  end_hits <- grep(box_office_end_pattern, movie_source_code)
  end_hits <- end_hits[end_hits > box_office_start_line]
  if (length(end_hits) == 0L) {
    return(character(0))
  }

  movie_source_code[box_office_start_line:end_hits[1L]]
}
# Worked example on the first movie link: fetch the page once, then derive
# the JSON block, the box office section, the release date and the basic
# fields from it.
curr_source_code <- h_get_movie_source_code(m_link[1])
curr_basics <- h_get_basics_from_movie_source_code(curr_source_code)
curr_box_office <- h_get_box_office_from_movie_source_code(curr_source_code)
curr_release_date <- get_release_date_from_movie_source_code(curr_source_code)
curr_basics_info <- h_get_basics_info(curr_basics)
curr_release_date
[1] "14 October 1994 (USA)"
curr_basics_info
[1] "Drama" "R" "2299252" "9.3" "1994-09-23"
curr_basics %>% cat()
<script type="application/ld+json">{ "@context": "http://schema.org", "@type": "Movie", "url": "/title/tt0111161/", "name": "The Shawshank Redemption", "image": "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg", "genre": "Drama", "contentRating": "R", "actor": [ { "@type": "Person", "url": "/name/nm0000209/", "name": "Tim Robbins" }, { "@type": "Person", "url": "/name/nm0000151/", "name": "Morgan Freeman" }, { "@type": "Person", "url": "/name/nm0348409/", "name": "Bob Gunton" }, { "@type": "Person", "url": "/name/nm0006669/", "name": "William Sadler" } ], "director": { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, "creator": [ { "@type": "Person", "url": "/name/nm0000175/", "name": "Stephen King" }, { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, { "@type": "Organization", "url": "/company/co0040620/" } ], "description": "The Shawshank Redemption is a movie starring Tim Robbins, Morgan Freeman, and Bob Gunton. Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "datePublished": "1994-09-23", "keywords": "wrongful imprisonment,based on the works of stephen king,prison,escape from prison,voice over narration", "aggregateRating": { "@type": "AggregateRating", "ratingCount": 2299252, "bestRating": "10.0", "worstRating": "1.0", "ratingValue": "9.3" }, "review": { "@type": "Review", "itemReviewed": { "@type": "CreativeWork", "url": "/title/tt0111161/" }, "author": { "@type": "Person", "name": "alexkolokotronis" }, "dateCreated": "2008-02-18", "inLanguage": "English", "name": "This is How Movies Should Be Made", "reviewBody": "This movie is not your ordinary Hollywood flick. It has a great and deep message. 
This movie has a foundation and just kept on being built on from their and that foundation is hope.\n\nOther than just the message of this movie the acting was phenomenal. Tim Robbins gave one of the greatest performances ever. He was inspiring, intelligent and most of all positive. His performance just made me smile. Robbins plays Andy Dufresne who was wrongfully convicted of murdering his wife and her lover. He is gets to life sentences but yet never gives up hope. In he becomes friends with Ellis Boyd \"Red\" Redding played by Morgan Freeman. Freeman who gives the finest performance of his career has unlike Robbins lost hope. He is in deep regret of the crime that he committed. His way of deflecting the pain away is by trying to not feel anything at all. With his friendship with Andy he learns that without our hopes and dreams we have nothing. Andy also becomes friends with the rest of Red\u0027s group. James Whitmore also gave a great performance as Brooks Halten who gets out of prison parole but in the words of Red he has been \"institutionalized\". \n\nThe directing by Frank Darabont was just magnificent. He kept this movie at a great steady pace along with the writing and great cinematography. He portrayed prison life in such a horrifying way, but not in terms of the physical pain but the stress and pain that wares mentally on the inmates, some of which deserve a second chance. \n\nWhatever you do, don\u0027t listen to the people who say this movie is overrated because this is one of the most inspiring and greatest movies ever. 
It has everything you could possibly want.", "reviewRating": { "@type": "Rating", "worstRating": "1", "bestRating": "10", "ratingValue": "10" } }, "duration": "PT2H22M", "trailer": { "@type": "VideoObject", "name": "Official Trailer", "embedUrl": "/video/imdb/vi3877612057", "thumbnail": { "@type": "ImageObject", "contentUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg" }, "thumbnailUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg", "description": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "uploadDate": "2014-03-05T14:13:19Z" } }</script>
curr_box_office %>% cat()
<h3 class="subheading">Box Office</h3> <div class="txt-block"> <h4 class="inline">Budget:</h4>$25,000,000 <span class="attribute">(estimated)</span> </div> <div class="txt-block"> <h4 class="inline">Opening Weekend USA:</h4> $727,327, <span class="attribute">25 September 1994</span> </div> <div class="txt-block"> <h4 class="inline">Gross USA:</h4> $28,699,976 </div> <div class="txt-block"> <h4 class="inline">Cumulative Worldwide Gross:</h4> $28,815,291 </div> <span class="see-more inline"> <a href="https://pro.imdb.com/title/tt0111161?rf=cons_tt_bo_tt&ref_=cons_tt_bo_tt" >See more on IMDbPro</a> » </span> <hr />
| Field | Pattern searched in the page source |
|---|---|
| Title | `h1 itemprop="name"` |
| Year | Next line of Title |
| Content Rating | `meta itemprop="contentRating"` |
| User Rating | `span itemprop="ratingValue"` |
| Number of Rater | `itemprop="ratingCount"` |
| Genre | `span class="itemprop" itemprop="genre"` |
| Budget | `<h4 class="inline">Budget` |
| Opening Weekend USA ($) | `<h4 class="inline">Opening Weekend USA` |
| Gross USA ($) | `<h4 class="inline">Gross` |
| Cumulative Worldwide Gross ($) | `<h4 class="inline">Cumulative` |
# Design function to get target information from a single page.
# Each input is a website link from `movie_link` (anything readLines() accepts
# as `con`).
#
# Returns an unnamed character vector of exactly 10 elements, in this order:
#   title, year, content rating, user rating, number of raters, genre(s),
#   budget, opening weekend USA, gross USA, cumulative worldwide gross.
#
# Conventions:
#   * A field whose marker line is not on the page is returned as "-".
#   * When a marker occurs on several lines, the first one is used (genres
#     excepted: every genre line is kept and joined with ", "), so the result
#     always has length 10 and fields never shift position.
#   * A marker line that lacks the expected pieces yields NA for that field.
#   * Budgets starting with one of the listed currency codes are trimmed to
#     "<CODE> <amount>"; budgets starting with the euro / pound sign (as the
#     HTML entity or the literal character) become "EUR <amount>" /
#     "GBP <amount>". Other budgets (e.g. "$...") are returned as found.
#
# NOTE(review): the title is cut at the first "&", so a title containing an
# HTML entity such as "&amp;" is truncated there -- confirm whether wanted.
get.target.info <- function(input) {
  page <- readLines(con = input, encoding = "UTF-8")
  missing_value <- "-"
  currency_codes <- c("FRF", "JPY", "INR", "DEM", "RUR", "TRL", "AUD", "KRW")
  # Prefix -> ISO code. The entity forms are what the raw page source holds;
  # the literal symbols are accepted too in case the source is decoded.
  currency_symbols <- c("&euro;" = "EUR", "\u20ac" = "EUR",
                        "&pound;" = "GBP", "\u00a3" = "GBP")

  # k-th piece of `x` after splitting on the literal `sep`; NA when absent.
  piece <- function(x, sep, k) {
    strsplit(x, split = sep, fixed = TRUE)[[1L]][k]
  }

  # First page line containing the literal `marker`, shifted down by `offset`
  # lines; NULL when the marker does not occur on the page.
  line_at <- function(marker, offset = 0L) {
    hits <- grep(marker, page, fixed = TRUE)
    if (length(hits) == 0L) {
      return(NULL)
    }
    page[hits[1L] + offset]
  }

  # k-th ">"-separated piece of the marker line, optionally cut at `closer`.
  tag_field <- function(marker, k, closer = NULL, offset = 0L) {
    line <- line_at(marker, offset)
    if (is.null(line)) {
      return(missing_value)
    }
    value <- piece(line, ">", k)
    if (!is.null(closer)) {
      value <- piece(value, closer, 1L)
    }
    value
  }

  # Second space-separated token after the label, minus one trailing comma.
  money_field <- function(marker) {
    line <- line_at(marker)
    if (is.null(line)) {
      return(missing_value)
    }
    amount <- piece(piece(line, ">", 3L), " ", 2L)
    sub(",$", "", amount)
  }

  # Normalise the currency prefix of a raw budget string.
  normalise_budget <- function(budget) {
    if (is.na(budget)) {
      return(budget)
    }
    if (substr(budget, 1L, 3L) %in% currency_codes) {
      tokens <- strsplit(budget, split = " ", fixed = TRUE)[[1L]]
      return(paste(tokens[1L], tokens[2L], sep = " "))
    }
    for (prefix in names(currency_symbols)) {
      if (startsWith(budget, prefix)) {
        amount <- substr(budget, nchar(prefix) + 1L, nchar(budget))
        return(paste(currency_symbols[[prefix]], amount))
      }
    }
    budget
  }

  # 1. title / 2. year (the year sits on the line after the title line)
  movie_title <- tag_field("h1 itemprop=\"name\"", 2L, "&")
  movie_year <- tag_field("h1 itemprop=\"name\"", 2L, "<", offset = 1L)

  # 3. content rating
  movie_content_rating <- tag_field("meta itemprop=\"contentRating\"", 2L)

  # 4. user rating / 5. number of raters
  movie_user_rating <- tag_field("span itemprop=\"ratingValue\"", 3L, "<")
  movie_num_rater <- tag_field("itemprop=\"ratingCount\"", 3L, "<")

  # 6. genre: one line per genre, joined with ", "
  genre_lines <- grep("span class=\"itemprop\" itemprop=\"genre\"", page,
                      fixed = TRUE, value = TRUE)
  if (length(genre_lines) == 0L) {
    movie_genre <- missing_value
  } else {
    genres <- vapply(
      genre_lines,
      function(line) piece(piece(line, ">", 3L), "<", 1L),
      character(1),
      USE.NAMES = FALSE
    )
    movie_genre <- paste(genres, collapse = ", ")
  }

  # 7. budget
  budget_line <- line_at("<h4 class=\"inline\">Budget")
  if (is.null(budget_line)) {
    movie_budget <- missing_value
  } else {
    movie_budget <- normalise_budget(piece(budget_line, ">", 3L))
  }

  # 8. opening / 9. gross / 10. worldwide gross
  movie_opening <- money_field("<h4 class=\"inline\">Opening Weekend USA")
  movie_gross <- money_field("<h4 class=\"inline\">Gross")
  movie_worldwide_gross <- money_field("<h4 class=\"inline\">Cumulative")

  # 11. result
  c(movie_title, movie_year, movie_content_rating, movie_user_rating,
    movie_num_rater, movie_genre, movie_budget, movie_opening, movie_gross,
    movie_worldwide_gross)
}
#Collecting data
# Scrape every movie page once, then slice the per-movie results into one
# character vector per field. At most the first 250 links are used (the
# original hard-coded bound); a shorter `movie_link` is handled without
# indexing past its end. Building the vectors in one pass per field avoids
# re-copying ten growing vectors on every iteration.
movie_pages <- head(movie_link, 250L)
movie_info <- lapply(movie_pages, get.target.info)

# k-th element of every per-movie result, as a plain character vector.
pick_field <- function(k) {
  unname(vapply(movie_info, function(info) info[k], character(1)))
}

movie_title <- pick_field(1L)
movie_year <- pick_field(2L)
movie_content_rating <- pick_field(3L)
movie_user_rating <- pick_field(4L)
movie_num_rater <- pick_field(5L)
movie_genre <- pick_field(6L)
movie_budget <- pick_field(7L)
movie_opening <- pick_field(8L)
movie_gross <- pick_field(9L)
movie_worldwide_gross <- pick_field(10L)
#Visualization
# Assemble the collected vectors into one data frame, force every column to
# character, and render it as a centred table with readable column headers.
library(knitr)
y <- data.frame(
  movie_rank,
  movie_title,
  movie_year,
  movie_content_rating,
  movie_user_rating,
  movie_num_rater,
  movie_genre,
  movie_budget,
  movie_opening,
  movie_gross,
  movie_worldwide_gross
)
# Convert all columns in place; `y[]` keeps the data frame structure.
y[] <- lapply(y, as.character)
kable(
  y,
  align = "c",
  col.names = c("Rank", "Title", "Year", "Content Rating", "User Rating",
                "Number of Rater", "Genre", "Budget", "Opening Weekend USA",
                "Gross USA", "Cumulative Worldwide Gross")
)